import re
from urllib.parse import urljoin

import requests
# Page to scrape; also used as the base for resolving article links.
url = 'https://36kr.com'

# Request headers sent with the page fetch: a mobile Safari User-Agent plus a
# cookie string.
# NOTE(review): the cookie looks like a captured browser session and will
# presumably expire -- confirm whether it is still required and refresh it.
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
    "cookie": "acw_tc=2760825e16045423406356537e46dceabadfac2cfa678509c2942197b967b9; acw_sc__v2=5fa35f8494c90bf041c06e05ea9b4486a9c8e9d0; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22175962de7d86-0f88e0f0f5f1f5-326a7207-1764000-175962de7d9bd4%22%2C%22%24device_id%22%3A%22175962de7d86-0f88e0f0f5f1f5-326a7207-1764000-175962de7d9bd4%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; Hm_lvt_1684191ccae0314c6254306a8333d090=1604401424,1604481306,1604539542; Hm_lvt_713123c60a0e86982326bae1a51083e1=1604401424,1604481306,1604539542; Hm_lpvt_1684191ccae0314c6254306a8333d090=1604542405; SERVERID=6eb0a1872728d69c244094a636b7db3b|1604542404|1604542343; Hm_lpvt_713123c60a0e86982326bae1a51083e1=1604542405"
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# Captures (href, title) for every news-item anchor.  The markup this pattern
# targets was copied from the browser's view of the page.  re.S lets ".*?"
# cross line breaks, so an anchor whose title <span> sits on a different line
# than the opening <a> tag is still matched.
ITEM_PATTERN = re.compile(
    r'<a class="item-info clearfloat" href="([^"]*).*?ellipsis-2">(.*?)</span>.*?</a>',
    re.S,
)


def fetch_html(page_url=url, request_headers=None, timeout=REQUEST_TIMEOUT):
    """Download a page and return its body as text.

    Args:
        page_url: Address to fetch; defaults to the module-level ``url``.
        request_headers: Headers to send; the module-level ``headers`` are
            used when this is None.
        timeout: Seconds to wait for the server.

    Returns:
        The response body decoded by requests (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    if request_headers is None:
        request_headers = headers
    response = requests.get(url=page_url, headers=request_headers, timeout=timeout)
    # Fail loudly instead of regex-scanning an error page and reporting 0 items.
    response.raise_for_status()
    return response.text


def extract_items(html: str) -> list:
    """Return a list of ``(href, title)`` tuples found in *html*.

    The list is empty when no news-item anchors are present.
    """
    return ITEM_PATTERN.findall(html)


def article_url(href: str, base_url: str = url) -> str:
    """Resolve *href* against *base_url*.

    urljoin gives the same result as plain concatenation for root-relative
    links such as ``/p/123`` and also handles absolute or slash-less hrefs.
    """
    return urljoin(base_url, href)


def main(dump_html=True):
    """Fetch the page, optionally dump it, then print every extracted item.

    Args:
        dump_html: When True (the default) the raw page text is printed
            before the extracted items, which helps when checking the regex
            against what the server actually returned.
    """
    html = fetch_html()
    if dump_html:
        print(html)

    item_list = extract_items(html)
    print("提取到的新闻个数:", len(item_list))

    for href, title in item_list:
        print(article_url(href), title)


if __name__ == "__main__":
    main()